'''
Correlation calculation between two goals for the reward function: effort spent and dropout response.
Author: Meng Zhang
Date: January 2024


Input: all_states.csv
Output: prints correlation measures: Cohen's kappa, Spearman's, and Pearson's.

'''

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import cohen_kappa_score

# Load the recorded states; the analysis below reads the "effort" and
# "dropout_response" columns of this table.
states_df = pd.read_csv("all_states.csv")

# Mean-impute missing entries. Only numeric columns have a mean, so restrict the
# computation explicitly: pandas >= 2.0 raises a TypeError when a non-numeric
# column is present, whereas older versions silently skipped such columns.
states_df = states_df.fillna(states_df.mean(numeric_only=True))

# Cohen's kappa compares the two columns as sequences of discrete labels, so
# both are cast to integers first (fractional values are truncated).
# NOTE(review): this runs on the raw, un-scaled columns even though the printed
# label mentions "standardization".
effort_list, dropout_list = (
    states_df[column].astype(int).to_numpy()
    for column in ("effort", "dropout_response")
)
cohens = cohen_kappa_score(effort_list, dropout_list)
print("cohen's kappa  for standardization: %.3f" % cohens)

# Z-score standardization of the two goal columns: x_std = (x - μ) / σ
cols_to_norm = ['effort', 'dropout_response']
norm_df = states_df.loc[:, cols_to_norm]

# Fit the scaler on the selected columns, then transform those same columns.
scaler = StandardScaler().fit(norm_df)
scaled = scaler.transform(norm_df)

# Wrap the resulting array in a DataFrame so columns stay addressable by name.
scaled_df = pd.DataFrame(scaled, columns=cols_to_norm)

# Rescale each column by its maximum absolute value so it lies in [-1, 1].
# The MaxAbsScaler instance itself must perform the transform; calling the
# StandardScaler here would silently produce z-scores a second time.
scaler_abs = MaxAbsScaler()
scaled_abs = scaler_abs.fit_transform(norm_df)
scaled_abs_df = pd.DataFrame(scaled_abs, columns=cols_to_norm)

# Correlation between the two goals, using the columns of scaled_df.
effort_list = scaled_df["effort"].astype(float).tolist()
dropout_list = scaled_df["dropout_response"].astype(float).tolist()

# Covariance matrix of the two columns; computed but not printed.
covariance = np.cov(effort_list, dropout_list)

# Pearson's r (linear association); the accompanying p-value is discarded.
corr, _ = pearsonr(effort_list, dropout_list)
print("Pearsons correlation for standardization: %.3f" % corr)

# Spearman's rho (rank-based association); the p-value is discarded as well.
corr_spearman, _ = spearmanr(effort_list, dropout_list)
print("Spearmans correlation for standardization: %.3f" % corr_spearman)

# Same correlation measures, this time on the columns of scaled_abs_df.
effort_list_abs = scaled_abs_df["effort"].astype(float).tolist()
dropout_list_abs = scaled_abs_df["dropout_response"].astype(float).tolist()

# Covariance matrix of the two columns; computed but not printed.
covariance_abs = np.cov(effort_list_abs, dropout_list_abs)

# Pearson's r (linear association); the accompanying p-value is discarded.
corr_abs, _ = pearsonr(effort_list_abs, dropout_list_abs)
print("Pearsons correlation for max abs scaler: %.3f" % corr_abs)

# Spearman's rho (rank-based association); the p-value is discarded as well.
corr_spearman_abs, _ = spearmanr(effort_list_abs, dropout_list_abs)
print("Spearmans correlation for max abs scaler: %.3f" % corr_spearman_abs)


